Data Management II

Learning objectives

By the end of the lab, you will be able to …

  • x
  • y

Packages

Load the standard packages.

library(here)
library(tidyverse) 
library(haven) # not core tidyverse
library(gssr)
library(gssrdoc)
library(summarytools)

GSS Panel Data: Download

https://gss.norc.org/get-the-data/stata


Heads Up!

Save and unzip this file in your class data folder.

GSS Panel Data: Load

# Use here() to construct the file path
gss_panel.dta <- here("data", "GSS_2020_panel_stata_1a/gss2020panel_r1a.dta")

#load the data using `haven::read_dta()`
data <- read_dta(gss_panel.dta)

# Or, do both at the same time!
# data <- read_dta(here("data", "GSS_2020_panel_stata_1a/gss2020panel_r1a.dta"))

GSS 2016-2020 Panel Dataset

Study of former 2016 and 2018 GSS respondents were interviewed again in 2020

  • Variables from 2016 (Wave 1a) have _1a appended
  • Variables from 2018 (Wave 1b) have _1b appended
  • Variables from 2020 (Wave 2) have _2 appended

GSS 2016-2020 Panel Dataset

# A tibble: 10 × 7
     yearid year_1a year_1b year_2 age_1a    age_1b    age_2    
      <dbl>   <dbl>   <dbl>  <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
 1 20182183      NA    2018     NA NA(i)        52     NA(i)    
 2 20180711      NA    2018     NA NA(i)        19     NA(i)    
 3 20182189      NA    2018   2020 NA(i)        37        39    
 4 20160354    2016      NA     NA    56     NA(i)     NA(i)    
 5 20180452      NA    2018   2020 NA(i)        29        31    
 6 20181503      NA    2018   2020 NA(i)        58        60    
 7 20162744    2016      NA   2020    71     NA(i)        75    
 8 20160315    2016      NA   2020    69     NA(i)        73    
 9 20160170    2016      NA     NA    75     NA(i)     NA(i)    
10 20161888    2016      NA     NA    71     NA(i)     NA(i)    

Manipulating Dataframes

Select variables that match a pattern

These selection helpers match variables according to a given pattern.

  • starts_with(): Starts with an exact prefix.
  • ends_with(): Ends with an exact suffix.
  • contains(): Contains a literal string.
my_data <- data |>
  select(yearid, wtssnr_2, 
         starts_with("age_"), 
         starts_with("family16_"),
         starts_with("socfrend_"),
         starts_with("childs_")) 


# You can supply multiple prefixes or suffixes.
my_data <- data |>
  select(yearid, wtssnr_2, 
         starts_with(c("age_", "family16_", "socfrend", "childs"))
         )

my_data <- as_factor(my_data) # Apply labels to data

head() & tail()

Look at the first few column names and first few rows.

head(my_data)
# A tibble: 6 × 14
    yearid wtssnr_2 age_1a age_1b age_2 family16_1a       family16_1b family16_2
     <dbl>    <dbl> <fct>  <fct>  <fct> <fct>             <fct>       <fct>     
1 20160001    1.44  47     <NA>   51    both own mother … <NA>        not avail…
2 20160002    0.722 61     <NA>   65    both own mother … <NA>        not avail…
3 20160003   NA     72     <NA>   <NA>  both own mother … <NA>        iap       
4 20160004    2.89  43     <NA>   47    mother only       <NA>        not avail…
5 20160005   NA     55     <NA>   <NA>  both own mother … <NA>        iap       
6 20160006   NA     53     <NA>   <NA>  other             <NA>        iap       
# ℹ 6 more variables: socfrend_1a <fct>, socfrend_1b <fct>, socfrend_2 <fct>,
#   childs_1a <fct>, childs_1b <fct>, childs_2 <fct>


Look at the first few column names and last few rows.

tail(my_data)
# A tibble: 6 × 14
    yearid wtssnr_2 age_1a age_1b age_2 family16_1a family16_1b       family16_2
     <dbl>    <dbl> <fct>  <fct>  <fct> <fct>       <fct>             <fct>     
1 20182343   NA     <NA>   19     <NA>  <NA>        father and stepm… iap       
2 20182344   NA     <NA>   37     <NA>  <NA>        mother and stepf… iap       
3 20182345    0.995 <NA>   75     77    <NA>        both own mother … not avail…
4 20182346    0.995 <NA>   67     70    <NA>        both own mother … not avail…
5 20182347   NA     <NA>   72     <NA>  <NA>        both own mother … iap       
6 20182348   NA     <NA>   79     <NA>  <NA>        both own mother … iap       
# ℹ 6 more variables: socfrend_1a <fct>, socfrend_1b <fct>, socfrend_2 <fct>,
#   childs_1a <fct>, childs_1b <fct>, childs_2 <fct>

Reminder: Tidy data

This data is NOT tidy!
Some column names include values of a variable (survey year).

# A tibble: 15,645 × 7
     yearid wtssnr_2 panel age   family16                    socfrend     childs
      <dbl>    <dbl> <chr> <fct> <fct>                       <fct>        <fct> 
 1 20160001    1.44  1a    47    both own mother and father  several tim… 3     
 2 20160001    1.44  1b    <NA>  <NA>                        <NA>         <NA>  
 3 20160001    1.44  2     51    not available for this year several tim… 3     
 4 20160002    0.722 1a    61    both own mother and father  several tim… 0     
 5 20160002    0.722 1b    <NA>  <NA>                        <NA>         <NA>  
 6 20160002    0.722 2     65    not available for this year about once … 0     
 7 20160003   NA     1a    72    both own mother and father  <NA>         2     
 8 20160003   NA     1b    <NA>  <NA>                        <NA>         <NA>  
 9 20160003   NA     2     <NA>  iap                         <NA>         <NA>  
10 20160004    2.89  1a    43    mother only                 once or twi… 4     
# ℹ 15,635 more rows

This data is tidy!
Each variable is in its own column, and each observation is in its own row.

pivot_longer()

my_data_long <- my_data |>
  pivot_longer(
    cols = 3:14,
    names_to = c("variable"),
    values_to = "value")

head(my_data_long)
1
cols = specifies which columns you want to turn into one
2
names_to = defines the name of the new variable containing the current variable names
3
values_to= defines the name of the new variable that takes in the values of the variables
# A tibble: 6 × 4
    yearid wtssnr_2 variable    value                      
     <dbl>    <dbl> <chr>       <fct>                      
1 20160001     1.44 age_1a      47                         
2 20160001     1.44 age_1b      <NA>                       
3 20160001     1.44 age_2       51                         
4 20160001     1.44 family16_1a both own mother and father 
5 20160001     1.44 family16_1b <NA>                       
6 20160001     1.44 family16_2  not available for this year

This is also not tidy data!
The variable column contains the names of the variable AND the panel.

separate()

my_data_long <- my_data |>
  pivot_longer(
    cols = c(-yearid, -wtssnr_2),
    names_to = c("variable"),
    values_to = "value") |>
  separate_wider_delim(variable,
                       delim = "_",
                       names = c("variable", "panel"))

head(my_data_long)
1
Split the column variable into two using a delimiter
2
A string giving the delimiter between values
3
names specifies the two new column names
# A tibble: 6 × 5
    yearid wtssnr_2 variable panel value                      
     <dbl>    <dbl> <chr>    <chr> <fct>                      
1 20160001     1.44 age      1a    47                         
2 20160001     1.44 age      1b    <NA>                       
3 20160001     1.44 age      2     51                         
4 20160001     1.44 family16 1a    both own mother and father 
5 20160001     1.44 family16 1b    <NA>                       
6 20160001     1.44 family16 2     not available for this year

This is still not tidy data!
The value variable is a mix of different types of values

pivot_wider()

my_data <- my_data |> # overwriting my_data
  pivot_longer(
    cols = c(-yearid, -wtssnr_2),
    names_to = c("variable"),
    values_to = "value") |>
    separate_wider_delim(variable, 
                         delim = "_", 
                         names = c("variable", "panel")) |>
  pivot_wider(
    names_from = variable,
    values_from = value)

head(my_data)
1
Increasing the number of columns and decreasing the number of rows
2
Which column to get the name of the output columns
3
Which column to get the cell values from
# A tibble: 6 × 7
    yearid wtssnr_2 panel age   family16                    socfrend      childs
     <dbl>    <dbl> <chr> <fct> <fct>                       <fct>         <fct> 
1 20160001    1.44  1a    47    both own mother and father  several time… 3     
2 20160001    1.44  1b    <NA>  <NA>                        <NA>          <NA>  
3 20160001    1.44  2     51    not available for this year several time… 3     
4 20160002    0.722 1a    61    both own mother and father  several time… 0     
5 20160002    0.722 1b    <NA>  <NA>                        <NA>          <NA>  
6 20160002    0.722 2     65    not available for this year about once a… 0     

Recode the reshaped variable

my_data <- my_data |>
  mutate(panel = case_when(
         panel == "1a" ~ 2016,
         panel == "1b" ~ 2018,
         panel == "2" ~ 2020,
         TRUE ~ NA_integer_))

head(my_data)
# A tibble: 6 × 7
    yearid wtssnr_2 panel age   family16                    socfrend      childs
     <dbl>    <dbl> <dbl> <fct> <fct>                       <fct>         <fct> 
1 20160001    1.44   2016 47    both own mother and father  several time… 3     
2 20160001    1.44   2018 <NA>  <NA>                        <NA>          <NA>  
3 20160001    1.44   2020 51    not available for this year several time… 3     
4 20160002    0.722  2016 61    both own mother and father  several time… 0     
5 20160002    0.722  2018 <NA>  <NA>                        <NA>          <NA>  
6 20160002    0.722  2020 65    not available for this year about once a… 0     

Heads Up!

family16 is a time-invariant variable.

relocate()

my_data <- my_data |> 
  relocate(panel)

names(my_data)
[1] "panel"    "yearid"   "wtssnr_2" "age"      "family16" "socfrend" "childs"  
my_data <- my_data |> 
  relocate(panel, .after = yearid)

names(my_data)
[1] "yearid"   "panel"    "wtssnr_2" "age"      "family16" "socfrend" "childs"  


# A tibble: 15,645 × 7
     yearid panel wtssnr_2 age   family16                    socfrend     childs
      <dbl> <dbl>    <dbl> <fct> <fct>                       <fct>        <fct> 
 1 20160001  2016    1.44  47    both own mother and father  several tim… 3     
 2 20160001  2018    1.44  <NA>  <NA>                        <NA>         <NA>  
 3 20160001  2020    1.44  51    not available for this year several tim… 3     
 4 20160002  2016    0.722 61    both own mother and father  several tim… 0     
 5 20160002  2018    0.722 <NA>  <NA>                        <NA>         <NA>  
 6 20160002  2020    0.722 65    not available for this year about once … 0     
 7 20160003  2016   NA     72    both own mother and father  <NA>         2     
 8 20160003  2018   NA     <NA>  <NA>                        <NA>         <NA>  
 9 20160003  2020   NA     <NA>  iap                         <NA>         <NA>  
10 20160004  2016    2.89  43    mother only                 once or twi… 4     
# ℹ 15,635 more rows

arrange()

my_data |> 
  arrange(panel) |>
  select(yearid, panel, age, family16)
# A tibble: 15,645 × 4
     yearid panel age   family16                  
      <dbl> <dbl> <fct> <fct>                     
 1 20160001  2016 47    both own mother and father
 2 20160002  2016 61    both own mother and father
 3 20160003  2016 72    both own mother and father
 4 20160004  2016 43    mother only               
 5 20160005  2016 55    both own mother and father
 6 20160006  2016 53    other                     
 7 20160007  2016 50    both own mother and father
 8 20160008  2016 23    both own mother and father
 9 20160009  2016 45    both own mother and father
10 20160010  2016 71    both own mother and father
# ℹ 15,635 more rows


my_data |> 
  arrange(desc(panel)) |>
  select(yearid, panel, age, family16)
# A tibble: 15,645 × 4
     yearid panel age   family16                   
      <dbl> <dbl> <fct> <fct>                      
 1 20160001  2020 51    not available for this year
 2 20160002  2020 65    not available for this year
 3 20160003  2020 <NA>  iap                        
 4 20160004  2020 47    not available for this year
 5 20160005  2020 <NA>  iap                        
 6 20160006  2020 <NA>  iap                        
 7 20160007  2020 <NA>  iap                        
 8 20160008  2020 27    not available for this year
 9 20160009  2020 49    not available for this year
10 20160010  2020 <NA>  iap                        
# ℹ 15,635 more rows